%matplotlib inline
import numpy as np
import pandas as pd
# Read both CSV tables, keeping every column as text (dtype=str).
stu_adm = pd.read_csv('ds/student_admission106.csv', encoding="utf-8", dtype=str)
uname = pd.read_csv('ds/univ_name106short1.csv', encoding="utf-8", dtype=str)

# Distinct department and student identifiers present in the admission table.
all_depid = stu_adm['department_id'].unique()
all_stuid = stu_adm['student_id'].unique()
ndepid, nstuid = all_depid.size, all_stuid.size
print("There are %d students and %d departments in total." % (nstuid, ndepid))

# The bare ``head`` expressions only render when they end a notebook cell;
# run as a plain script they display nothing.
print("offers received by students:")
stu_adm.head(10)
print("academic department basic information:")
uname.head(10)
'''
Q1.1: Report the number of academic departments and student applicants in your dataset.
Q1.2: Report the top ten departments that received the most applications and the number of applications they received. Identify the departments by their department_id and names.
'''
# Prune a copy of the admission table until a full pass removes nothing more:
#   1. keep departments that have at least 10 student rows,
#   2. keep students that have more than 1 department row.
# Each filter can invalidate the other, hence the repetition.
stu = stu_adm.copy()
while True:
    # student >= 10
    dept_sizes = stu.groupby('department_id')['student_id'].count()
    row = dept_sizes[dept_sizes >= 10].index.tolist()
    stu = stu[stu['department_id'].isin(row)]
    old = stu.shape

    # application > 1
    stu_sizes = stu.groupby('student_id')['department_id'].count()
    col = stu_sizes[stu_sizes > 1].index.tolist()
    stu = stu[stu['student_id'].isin(col)]
    new = stu.shape

    # Stop once the student filter left the table unchanged.
    if old == new:
        break
# Display label per department: school abbreviation followed by the
# department abbreviation.
uname['combine'] = uname['school_name_abbr'].map(str) + uname['department_name_abbr']

# Department-by-student contingency table built from the filtered admissions:
# one row per department_id, one column per student_id.
todo = pd.crosstab(index=stu['department_id'], columns=stu['student_id'])

# Same rows joined with the department metadata; the merge result carries a
# fresh 0..n-1 index, which the later ``pd.concat(..., axis=1)`` calls rely on.
todoDf = todo.merge(uname, how='left', left_on='department_id', right_on='department_id')

# Use a plain ndarray instead of ``np.matrix``: NumPy discourages the matrix
# class and recent scikit-learn input validation rejects it outright.
todoMatrix = np.asarray(todo)

print('number of academic department:', todoMatrix.shape[0])
print('number of student applicants:', todoMatrix.shape[1])
# Top ten departments by number of applications in the filtered table ``stu``
# (count of non-null student_id values per department_id).
topten = stu.groupby('department_id').count()
topten = topten.sort_values(by='student_id', ascending=False)

# Build the id -> name lookup once instead of scanning ``uname`` with a
# boolean mask on every iteration; the first listed name wins, as before.
dept_names = uname.drop_duplicates('department_id').set_index('department_id')['department_name']

print('top ten departments:')
for d, n_applications in topten['student_id'].head(10).items():
    # Q1.2 also asks for the number of applications, so report it alongside
    # the id and name. A department missing from ``uname`` is shown as
    # 'unknown' rather than aborting the report with an IndexError.
    print('department_id:', d,
          ' department_name:', dept_names.get(d, 'unknown'),
          ' applications:', n_applications)
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
# Font handed to the legends and point labels of the plots in this file.
# The path is hard-coded and Windows-specific, so loading fails on machines
# without this file. NOTE(review): presumably chosen because it can render the
# Chinese category/department names - confirm.
myfont = FontProperties(fname=r'C:\Windows\Fonts\msjh.ttc')
def draw(df, method):
    """Scatter-plot a two-dimensional embedding, one series per category.

    Parameters
    ----------
    df : DataFrame
        Must provide the columns 'pc1', 'pc2' and 'category_name'; no other
        column is read.
    method : str
        Text used as the plot title.

    A new 8x8 inch figure is created on every call. Nothing is returned.
    """
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.set_title(method, fontsize=20)
    ax.set_xlabel('1st component', fontsize=15)
    ax.set_ylabel('2nd component', fontsize=15)

    # One scatter call per distinct category value, in order of first
    # appearance, so the legend entries line up with the plotted series.
    categories = df['category_name'].unique()
    for category in categories:
        mask = df['category_name'] == category
        ax.scatter(df.loc[mask, 'pc1'], df.loc[mask, 'pc2'], s=50)

    ax.legend(categories, prop=myfont)
    ax.grid()
'''Q2.1: Visualize academic departments using the first 8 principal components'''
from sklearn.decomposition import PCA
# Project every department row onto 8 principal components and attach the
# department id and category columns for plotting.
pc = PCA(n_components=8).fit_transform(todoMatrix)
principalDf = pd.DataFrame(
    pc,
    columns=['pc1', 'pc2', 'pc3', 'pc4', 'pc5', 'pc6', 'pc7', 'pc8'],
)
principalDf_final = pd.concat(
    [principalDf, todoDf[['department_id', 'category_name']]],
    axis=1,
)

# Scatter of the first two components, one series per category.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_title('PCA', fontsize=20)
ax.set_xlabel('pc1', fontsize=15)
ax.set_ylabel('pc2', fontsize=15)

targets = principalDf_final['category_name'].unique()
for target in targets:
    mask = principalDf_final['category_name'] == target
    ax.scatter(
        principalDf_final.loc[mask, 'pc1'],
        principalDf_final.loc[mask, 'pc2'],
        s=50,
    )

ax.legend(targets, prop=myfont)
ax.grid()
'''
1st principal component和2nd principal component明顯地將'醫藥衛生及社會福利'與'工程、製造及營建'區分出來
'''
# Scatter of the 4th against the 8th principal component, one series per
# category, reusing the PCA table built earlier in the file.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_title('PCA', fontsize=20)
ax.set_xlabel('pc4', fontsize=15)
ax.set_ylabel('pc8', fontsize=15)

targets = principalDf_final['category_name'].unique()
for target in targets:
    mask = principalDf_final['category_name'] == target
    ax.scatter(
        principalDf_final.loc[mask, 'pc4'],
        principalDf_final.loc[mask, 'pc8'],
        s=50,
    )

ax.legend(targets, prop=myfont)
ax.grid()
'''
4th principal component和8th principal component將'商業、管理及法律'、'藝術及人文'與'工程、製造及營建'區分出來
'''
'''Q2.2: Visualize academic departments using multidimensional scaling'''
from sklearn.manifold import MDS
# Id and category columns appended to each 2-D embedding so that ``draw``
# can group the points by category.
mds_labels = todoDf[['department_id', 'category_name']]

# metric setting
embedding = MDS(n_components=2)
todo_mds = embedding.fit_transform(todoMatrix)
mdsDf = pd.DataFrame(todo_mds, columns=['pc1', 'pc2'])
mdsDf_final = pd.concat([mdsDf, mds_labels], axis=1)
draw(mdsDf_final, 'metric MDS')

# non-metric setting
embedding2 = MDS(n_components=2, metric=False)
todo_nmds = embedding2.fit_transform(todoMatrix)
nmdsDf = pd.DataFrame(todo_nmds, columns=['pc1', 'pc2'])
nmdsDf_final = pd.concat([nmdsDf, mds_labels], axis=1)
draw(nmdsDf_final, 'Non-metric MDS')
'''
該資料集不適用metric MDS及non-metric MDS方法,所有系所類別顏色點重疊情況多,效果不好,non-metric又比metric分得更不清楚
'''
'''
Q2.3: Visualize academic departments using Locally Linear Embedding. Consider three variations:
(1) Use 20 neighbors to construct the weight matrix;
(2) Use 40 neighbors to construct the weight matrix;
(3) Perform PCA transformation first, and use the first 100 principal components as the input to LLE (with 20 neighbors).
'''
from sklearn.manifold import LocallyLinearEmbedding
# Variation (1): LLE on the raw matrix with 20 neighbours.
lle1_model = LocallyLinearEmbedding(n_neighbors=20, n_components=2)
todo_lle1 = lle1_model.fit_transform(todoMatrix)
lle1Df = pd.DataFrame(todo_lle1, columns=['pc1', 'pc2'])
lle1Df_final = pd.concat(
    [lle1Df, todoDf[['department_id', 'category_name']]],
    axis=1,
)
draw(lle1Df_final, 'LLE_20neighbors')
from sklearn.manifold import LocallyLinearEmbedding
# Variation (2): LLE on the raw matrix with 40 neighbours.
lle2_model = LocallyLinearEmbedding(n_neighbors=40, n_components=2)
todo_lle2 = lle2_model.fit_transform(todoMatrix)
lle2Df = pd.DataFrame(todo_lle2, columns=['pc1', 'pc2'])
lle2Df_final = pd.concat(
    [lle2Df, todoDf[['department_id', 'category_name']]],
    axis=1,
)
draw(lle2Df_final, 'LLE_40neighbors')
# Variation (3): reduce to 100 principal components first, then run LLE with
# 20 neighbours on that reduced representation.
pca = PCA(n_components=100).fit_transform(todoMatrix)
lle3_model = LocallyLinearEmbedding(n_neighbors=20, n_components=2)
todo_lle3 = lle3_model.fit_transform(pca)
lle3Df = pd.DataFrame(todo_lle3, columns=['pc1', 'pc2'])
lle3Df_final = pd.concat(
    [lle3Df, todoDf[['department_id', 'category_name']]],
    axis=1,
)
draw(lle3Df_final, 'LLE_20neighbors with PCA')
'''
先用PCA降成100維的Locally Linear Embedding方法並沒有顯著比沒先用PCA好,
20 neighbors比40 neighbors表現好,可能是因為鄰居40個分太細
'''
'''
Q2.4: Visualize academic department using Kernel PCA. You should at least consider the RBF and Cosine kernel.
It is your responsibility to select reasonably good kernel parameters.
'''
from sklearn.decomposition import KernelPCA
# Kernel PCA down to two components with two kernels. No kernel parameters
# are passed, so the library defaults apply to both runs.
# The plot titles previously misspelled "Kernel" as "Kernal"; fixed here.

# RBF kernel
transformer = KernelPCA(n_components=2, kernel='rbf')
todo_rbf = transformer.fit_transform(todoMatrix)
rbfDf = pd.DataFrame(data=todo_rbf, columns=['pc1', 'pc2'])
rbfDf_final = pd.concat([rbfDf, todoDf['department_id'], todoDf['category_name']], axis=1)
draw(rbfDf_final, 'RBF Kernel PCA')

# Cosine kernel
transformer = KernelPCA(n_components=2, kernel='cosine')
todo_cos = transformer.fit_transform(todoMatrix)
cosDf = pd.DataFrame(data=todo_cos, columns=['pc1', 'pc2'])
cosDf_final = pd.concat([cosDf, todoDf['department_id'], todoDf['category_name']], axis=1)
draw(cosDf_final, 'Cosine Kernel PCA')
'''
RBF kernel PCA 明顯將'醫藥衛生及社會福利'和'工程、製造及營建'區分出來,對該資料集而言,效果與前述的PCA差不多
RBF和Cosine kernel PCA 沒有明顯優劣
'''
'''
Q2.5: Visualize academic department using t-SNE. You should consider at least the Euclidian, Cosine, and Jaccard metric.
Set numpy random seed so that your results can be repeated.
'''
from sklearn.manifold import TSNE
# Seed NumPy's global generator; each TSNE instance is additionally given a
# fixed random_state.
np.random.seed(5)

# Id and category columns appended to every 2-D embedding for plotting.
tsne_labels = todoDf[['department_id', 'category_name']]

# Euclidean metric
tsne_euc_model = TSNE(n_components=2, metric='euclidean', random_state=10)
todo_tsne_euc = tsne_euc_model.fit_transform(todoMatrix)
tsne_eucDf = pd.DataFrame(todo_tsne_euc, columns=['pc1', 'pc2'])
tsne_eucDf_final = pd.concat([tsne_eucDf, tsne_labels], axis=1)
draw(tsne_eucDf_final, 't-SNE_euclidian')

# Cosine metric
tsne_cos_model = TSNE(n_components=2, metric='cosine', random_state=10)
todo_tsne_cos = tsne_cos_model.fit_transform(todoMatrix)
tsne_cosDf = pd.DataFrame(todo_tsne_cos, columns=['pc1', 'pc2'])
tsne_cosDf_final = pd.concat([tsne_cosDf, tsne_labels], axis=1)
draw(tsne_cosDf_final, 't-SNE_cosine')

# Jaccard metric
tsne_jac_model = TSNE(n_components=2, metric='jaccard', random_state=10)
todo_tsne_jac = tsne_jac_model.fit_transform(todoMatrix)
tsne_jacDf = pd.DataFrame(todo_tsne_jac, columns=['pc1', 'pc2'])
tsne_jacDf_final = pd.concat([tsne_jacDf, tsne_labels], axis=1)
draw(tsne_jacDf_final, 't-SNE_jaccard')
'''
euclidian t-SNE將大部分資料都集中在一起,沒有明顯區隔,效果差
cosine t-SNE將資料區分得較清楚,顏色區塊明顯
jaccard t-SNE也能看出明顯的資料區別,但稍微較cosine差一點,jaccard資料點較密集,cosine較分散
'''
'''Q3 Refine'''
'''choose Cosine t-SNE'''
# Attach the school+department display label to every point of the cosine
# t-SNE embedding.
tsne_cosDf_Q3 = pd.concat([tsne_cosDf_final, todoDf['combine']], axis=1)

# Very large canvas so that the tiny per-point labels stay legible when the
# saved image is zoomed in.
fig = plt.figure(figsize=(70, 70))
ax = fig.add_subplot(1, 1, 1)
ax.set_xlabel('pc1', fontsize=15)
ax.set_ylabel('pc2', fontsize=15)
ax.set_title('Cosine t-SNE', fontsize=20)

# One scatter series per category so the legend entries match the colours.
targets = tsne_cosDf_Q3['category_name'].unique()
for target in targets:
    indicesToKeep = tsne_cosDf_Q3['category_name'] == target
    ax.scatter(tsne_cosDf_Q3.loc[indicesToKeep, 'pc1'],
               tsne_cosDf_Q3.loc[indicesToKeep, 'pc2'],
               s=50)

# Label every point exactly once, slightly offset from its marker.
# Columns are read by name in a single pass: the previous ``row[0]`` /
# ``row[1]`` positional access on a string-labelled row is deprecated in
# pandas 2.x, and it materialised three row objects per point.
for x, y, label in zip(tsne_cosDf_Q3['pc1'],
                       tsne_cosDf_Q3['pc2'],
                       tsne_cosDf_Q3['combine']):
    ax.text(x + 0.3, y + 0.3, label, fontsize=3, fontproperties=myfont)

ax.legend(targets, prop=myfont)
ax.grid()
plt.savefig('hw4.jpg', dpi=300)
'''
使用cosine t-SNE資料區分得較清楚,分得比較開,將圖放大後可以發現相似系所名稱的被分在一起
'''